# -*- coding:utf-8 -*-
import jieba
import nltk
import sys
import csv

# Extract the 250 most frequent words from GBK-encoded job-description text,
# drop stop words and whitespace tokens, and write the result as a GBK CSV
# (keyword, frequency) — GBK so the file opens correctly in Excel on
# Chinese-locale Windows.
#
# Encoding boundaries are handled explicitly (the old Py2
# reload(sys)/setdefaultencoding hack masked bugs and is gone in Python 3):
#   - stop-word list is UTF-8
#   - corpus is GBK
#   - output CSV is GBK

# Use a set for O(1) stop-word membership tests inside the filter below.
with open("./lagou_data/filter_word_utf8.txt", "r", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f}

# Decode the corpus once at the I/O boundary; everything downstream is str.
with open("./lagou_data/description_data.txt", "r", encoding="gbk") as f:
    content = f.read().lower()

# jieba.cut yields unicode segments; FreqDist counts their occurrences.
word_freq = nltk.FreqDist(jieba.cut(content))

# Take the 250 most frequent words. Sorting ascending and slicing the tail
# preserves the original output order (least frequent of the top 250 first).
top_words = sorted(word_freq.items(), key=lambda item: item[1])[-250:]

# Filter stop words and bare whitespace tokens. Comparing str to str here
# fixes the old bytes-vs-unicode mismatch that silently disabled the
# stop-word filter for non-ASCII words.
rows = [
    [word, freq]
    for word, freq in top_words
    if word not in stopwords and word != '\t' and word != '\n'
]

# newline='' is required by the csv module to avoid blank lines on Windows.
with open("./lagou_data/top_255_words.csv", "w", encoding="gbk", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['keyword', 'frequency'])
    csv_writer.writerows(rows)
